#install.packages("ggplot2")
#install.packages("dplyr")
#install.packages("corrplot")
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(corrplot)
## corrplot 0.92 loaded
For the data preprocessing part, we will do three things for now: 1. Load the Dataset 2. Check for Missing Values 3. Check Variable Datatypes & Typecast if necessary
#setwd = "C:\\Users\\YASH KATTIMANI\\Downloads"
# Read the exercise dataset (expected in the working directory) and show
# the column names, types and a preview of the values.
df <- read.csv(file = "newexercise_dataset.csv")
str(object = df)
## 'data.frame': 3864 obs. of 12 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Exercise : chr "Aerobics" "Strength Training" "High Intensity Training" "Weight Training" ...
## $ Calories.Burn : num 287 343 261 127 416 ...
## $ Dream.Weight : num 91.9 64.2 70.8 79.5 90 ...
## $ Actual.Weight : num 96.3 61.1 71.8 83 85.6 ...
## $ Age : int 45 25 20 33 29 60 18 42 49 41 ...
## $ Gender : chr "Male" "Male" "Male" "Male" ...
## $ Duration : int 37 43 20 39 34 41 53 25 37 55 ...
## $ Heart.Rate : int 170 142 148 170 118 169 103 104 161 103 ...
## $ BMI : num 29.4 21.3 27.9 33.7 23.3 ...
## $ Weather.Conditions: chr "Rainy" "Rainy" "Cloudy" "Sunny" ...
## $ Exercise.Intensity: int 5 5 4 10 3 10 10 2 1 10 ...
# Total number of missing cells across every column of the data frame.
null_values <- length(which(is.na(df)))
print(null_values)
## [1] 0
Turns out there are 0 null values, so we can go ahead without having to worry about this aspect.
# Store the three text columns as factors (categorical variables).
categorical_cols <- c("Exercise", "Gender", "Weather.Conditions")
df[categorical_cols] <- lapply(df[categorical_cols], as.factor)
Frequency of different Exercises
library(ggplot2)
# Bar chart of how many rows each exercise type has; each exercise gets its
# own colour from a rainbow palette sized to the number of distinct exercises.
exercise_count <- ggplot(data = df, mapping = aes(x = Exercise, fill = Exercise)) +
  geom_bar() +
  scale_fill_manual(values = rainbow(length(unique(df$Exercise)))) +
  theme(axis.text.x = element_text(angle = 75, hjust = 1)) +
  labs(
    title = "Distribution of Exercise",
    x = "Exercise",
    y = "Frequency of Occurrence"
  )
print(exercise_count)
Limited Engagement in Aerobics: The frequency of “Aerobics” is notably low compared to other types of exercise. This suggests that participants either lack interest in aerobic exercises or do not prioritize them.
Basic Workout Anomaly: Despite its generic label, “Basic Workout” has a surprisingly high frequency. This indicates people are engaged in simpler forms of exercise, or it might be a catch-all category for various other exercises not specified.
Classification of BMI values to 4 different categories
# Label each row with a BMI band (stored as character). Bands are left-closed:
#   BMI < 18.5         -> "Underweight"
#   18.5 <= BMI < 25   -> "Healthy weight"
#   25 <= BMI < 30     -> "Overweight"
#   BMI >= 30          -> "Obesity"
# findInterval() returns 0..3 for the four bands, hence the + 1L index shift.
bmi_bands <- c("Underweight", "Healthy weight", "Overweight", "Obesity")
df$BMI.Classification <- bmi_bands[findInterval(df$BMI, c(18.5, 25, 30)) + 1L]
head(df)
## ID Exercise Calories.Burn Dream.Weight Actual.Weight Age
## 1 1 Aerobics 286.9599 91.89253 96.30112 45
## 2 2 Strength Training 343.4530 64.16510 61.10467 25
## 3 3 High Intensity Training 261.2235 70.84622 71.76672 20
## 4 4 Weight Training 127.1839 79.47701 82.98446 33
## 5 5 Basic Workout 416.3184 89.96023 85.64317 29
## 6 6 Yoga 479.7227 78.88758 80.59659 60
## Gender Duration Heart.Rate BMI Weather.Conditions Exercise.Intensity
## 1 Male 37 170 29.42627 Rainy 5
## 2 Male 43 142 21.28635 Rainy 5
## 3 Male 20 148 27.89959 Cloudy 4
## 4 Male 39 170 33.72955 Sunny 10
## 5 Female 34 118 23.28611 Cloudy 3
## 6 Female 41 169 34.71934 Rainy 10
## BMI.Classification
## 1 Overweight
## 2 Healthy weight
## 3 Overweight
## 4 Obesity
## 5 Healthy weight
## 6 Obesity
Classification of Levels of Intensity based on Intensity values
# Collapse the numeric Exercise.Intensity score into three character labels:
#   score < 3        -> "Low Intensity"
#   3 <= score < 6   -> "Medium Intensity"
#   score >= 6       -> "High Intensity"
intensity_bands <- c("Low Intensity", "Medium Intensity", "High Intensity")
df$Intensity <- intensity_bands[findInterval(df$Exercise.Intensity, c(3, 6)) + 1L]
# Preview the first rows to confirm the new column
head(df)
## ID Exercise Calories.Burn Dream.Weight Actual.Weight Age
## 1 1 Aerobics 286.9599 91.89253 96.30112 45
## 2 2 Strength Training 343.4530 64.16510 61.10467 25
## 3 3 High Intensity Training 261.2235 70.84622 71.76672 20
## 4 4 Weight Training 127.1839 79.47701 82.98446 33
## 5 5 Basic Workout 416.3184 89.96023 85.64317 29
## 6 6 Yoga 479.7227 78.88758 80.59659 60
## Gender Duration Heart.Rate BMI Weather.Conditions Exercise.Intensity
## 1 Male 37 170 29.42627 Rainy 5
## 2 Male 43 142 21.28635 Rainy 5
## 3 Male 20 148 27.89959 Cloudy 4
## 4 Male 39 170 33.72955 Sunny 10
## 5 Female 34 118 23.28611 Cloudy 3
## 6 Female 41 169 34.71934 Rainy 10
## BMI.Classification Intensity
## 1 Overweight Medium Intensity
## 2 Healthy weight Medium Intensity
## 3 Overweight Medium Intensity
## 4 Obesity High Intensity
## 5 Healthy weight Medium Intensity
## 6 Obesity High Intensity
Converting Heart rate to different levels “Low”, “Moderate”, “Elevated”, “High”, “Very High”
# Band edges and names for the heart-rate levels. With right = FALSE the
# intervals are left-closed: [0, 100), [100, 120), [120, 140), [140, 160), [160, Inf).
breaks <- c(0, 100, 120, 140, 160, Inf)
labels <- c("Low", "Moderate", "Elevated", "High", "Very High")
# Bin Heart.Rate into the named levels (result is a factor)
df[["HeartRateLevel"]] <- cut(
  x = df[["Heart.Rate"]],
  breaks = breaks,
  labels = labels,
  right = FALSE
)
# Kernel density of Age, one semi-transparent curve per Gender.
# The same two colours are used for both the fill and the outline.
gender_colours <- c("blue", "red")
density_plot <- ggplot(data = df, mapping = aes(x = Age, fill = Gender, color = Gender)) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(values = gender_colours) +
  scale_color_manual(values = gender_colours) +
  theme_minimal() +
  labs(title = "Age distribution", x = "Age", y = "Density of Activity")
print(density_plot)
# Load the ggplot2 library
library(ggplot2)
# Histogram of Age using 5-year-wide bins
hist_plot <- ggplot(df, aes(x = Age)) +
  geom_histogram(binwidth = 5, fill = "lightblue", color = "black") +
  labs(title = "Distribution of Age", x = "Age", y = "Frequency of Age Groups") +
  theme_minimal()
# Add a text label in the middle of each bar showing that bin's share of all
# observations. after_stat() replaces the `..count..` dot-dot notation, which
# is deprecated since ggplot2 3.4.0; the binwidth must match the histogram's
# so the labels line up with the bars.
hist_plot <- hist_plot +
  stat_bin(
    binwidth = 5,
    geom = "text",
    aes(label = scales::percent(after_stat(count / sum(count)))),
    position = position_stack(vjust = 0.5)
  )
# Display the histogram plot
print(hist_plot)
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Concentration in Mid-Age Groups: The highest frequency of ages falls within the 25-35 age bracket, suggesting that this dataset predominantly consists of individuals who are in their late 20s to early 30s.
Sparse Older Population: There is a noticeable decline in frequency for age groups above 50, indicating that fewer older individuals are represented in this exercise dataset.
# Scatter plot of Heart.Rate (y) against BMI (x), coloured by BMI class.
# The title now names only the variables that are actually drawn; the previous
# title mentioned exercise duration and dream weight, neither of which is mapped.
ggplot(df, aes(x = BMI, y = Heart.Rate, color = BMI.Classification)) +
  geom_point() +
  ggtitle("Heart Rate vs BMI by BMI Classification")
library(ggplot2)
library(ggplot2)
# Bar chart of the number of individuals in each BMI class, with a fixed
# colour per class.
bmi_bar_plot <- ggplot(df, aes(x = BMI.Classification, fill = BMI.Classification)) +
  geom_bar() +
  labs(title = "BMI Classification", x = "Classification", y = "Number of Individuals") +
  scale_fill_manual(values = c(
    "Underweight" = "red",
    "Healthy weight" = "lightgreen",
    "Overweight" = "maroon",
    "Obesity" = "orange"
  )) +
  theme_minimal()
# Add each class's share of all rows as a text label on its bar.
# after_stat() replaces the `..count..` notation deprecated in ggplot2 3.4.0.
bmi_bar_plot <- bmi_bar_plot +
  geom_text(
    aes(
      label = scales::percent(after_stat(count / sum(count))),
      y = after_stat(count),
      group = BMI.Classification
    ),
    stat = "count",
    vjust = -0.5, # Adjust the vertical position of labels
    hjust = 0.5, # Center the labels horizontally
    position = position_stack(vjust = 0.5)
  )
# Display the bar plot
print(bmi_bar_plot)
# Number of rows in each BMI class
bmi_class_counts <- table(df$BMI.Classification)
# The same distribution expressed as percentages
bmi_class_percentages <- prop.table(bmi_class_counts) * 100
# Slice labels such as "Obesity (33.3%)"; paste0() avoids the stray spaces
# that paste() inserted inside the parentheses.
slice_labels <- paste0(
  names(bmi_class_counts), " (", round(bmi_class_percentages, 1), "%)"
)
# One distinct colour per class that is present. The first three colours are
# the ones used before; the fourth prevents colour recycling if all four BMI
# classes occur in the data.
slice_colours <- c("red", "green", "blue", "orange")[seq_along(bmi_class_counts)]
# Draw the pie chart and add its title
pie(bmi_class_counts, labels = slice_labels, col = slice_colours)
title("Distribution by BMI Classification")
# Fixed fill colour for each BMI class
bmi_fill_colours <- c(
  "Underweight" = "magenta",
  "Overweight" = "cyan",
  "Obesity" = "orange",
  "Healthy weight" = "purple"
)
# Side-by-side bars: participants per exercise, split by BMI class
bar_plot <- ggplot(data = df, mapping = aes(x = Exercise, fill = BMI.Classification)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = bmi_fill_colours) +
  theme(axis.text.x = element_text(angle = 75, hjust = 1)) +
  labs(
    title = "Exercise by BMI Classification",
    x = "Exercise",
    y = "Number of Participants"
  )
# Render the chart
print(bar_plot)
Diverse BMI Categories in High-Intensity Training: High Intensity Training appears to attract participants from all BMI classifications, from “Underweight” to “Obesity”, indicating that this form of exercise is popular across different health statuses.
Limited “Underweight” Participation: There are notably fewer “Underweight” individuals participating in most types of exercise, except for “High Intensity Training”. This could indicate a lack of engagement or interest among this particular group in the dataset.
# Load required libraries
library(ggplot2)
library(dplyr)
library(scales)
# Count rows per Exercise x Gender cell. Grouping is dropped before the
# percentage is computed, so `frequency` is each cell's share of ALL rows.
df_summary <- df %>%
  group_by(Exercise, Gender) %>%
  summarise(count = n(), .groups = "drop") %>%
  mutate(frequency = 100 * count / sum(count))
# One facet per exercise, one fully opaque bar per gender; the y axis is
# formatted as a percentage (values are already on a 0-100 scale).
grouped_bar_plot <- ggplot(
  data = df_summary,
  mapping = aes(x = Gender, y = frequency, fill = Gender)
) +
  geom_col(position = "dodge", alpha = 1) +
  facet_wrap(vars(Exercise)) +
  scale_fill_manual(values = c("hotpink", "dodgerblue")) +
  scale_y_continuous(labels = scales::percent_format(scale = 1)) +
  labs(
    title = "Gender-Based Participation in Different Forms of Exercise",
    fill = "Gender",
    x = "",
    y = "Percentage of Participation"
  ) +
  theme_light() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    legend.position = "top"
  )
# Render the chart
print(grouped_bar_plot)
# Load required libraries
library(dplyr)
library(ggplot2)
library(scales)
# Count rows per Exercise x Gender x BMI class, then express each count as a
# percentage of its Exercise x Gender group.
# `.groups = "drop_last"` states explicitly the grouping the percentage relies
# on (it was previously the implicit default, which emits an informational
# message), and ungroup() ensures the result is not left grouped.
df_summary <- df %>%
  group_by(Exercise, Gender, BMI.Classification) %>%
  summarise(count = n(), .groups = "drop_last") %>%
  mutate(frequency = count * 100 / sum(count)) %>%
  ungroup()
## `summarise()` has grouped output by 'Exercise', 'Gender'. You can override
## using the `.groups` argument.
# Fill colour for each BMI class shown in this chart
bmi_class_colours <- c(
  "Overweight" = "gold",
  "Obesity" = "purple",
  "Healthy weight" = "turquoise"
)
# One facet per exercise; within each gender, dodged bars per BMI class.
# `frequency` is already on a 0-100 scale, hence scale = 1 in the formatter.
grouped_bar_plot <- ggplot(
  data = df_summary,
  mapping = aes(x = Gender, y = frequency, fill = BMI.Classification)
) +
  geom_col(position = "dodge") +
  facet_wrap(vars(Exercise)) +
  scale_fill_manual(values = bmi_class_colours) +
  scale_y_continuous(labels = scales::percent_format(scale = 1)) +
  labs(
    title = "Comparative Analysis of Exercise Participation by Gender and BMI Classification",
    fill = "BMI Classification",
    x = "",
    y = "Percentage of Participation"
  ) +
  theme_light() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    legend.position = "top"
  )
# Render the chart
print(grouped_bar_plot)
Predominance of “Healthy weight” in Males: In almost all exercise types except for “Strength Training”, the majority of male participants fall under the “Healthy weight” category. This could indicate that males in the “Healthy weight” BMI category are more inclined towards a diverse range of exercises.
Diverse BMI Categories in Females: Unlike males, females show a more balanced distribution across different BMI categories for exercises like “High Intensity Training” and “Strength Training”. This suggests that these exercises appeal to women of various BMI classifications.
library(ggplot2)
# Three scatter plots of Calories.Burn against Age, Actual.Weight and
# Duration, each coloured by Exercise.Intensity and drawn on a white panel.
white_panel <- theme(panel.background = element_rect(fill = "white"))

ggplot(data = df, mapping = aes(x = Age, y = Calories.Burn, color = Exercise.Intensity)) +
  geom_point() +
  labs(
    title = "Influence of Age on Exercise Intensity and Calorie Burn",
    x = "Age",
    y = "Calories Burnt"
  ) +
  white_panel

ggplot(data = df, mapping = aes(x = Actual.Weight, y = Calories.Burn, color = Exercise.Intensity)) +
  geom_point() +
  labs(
    title = "Influence of Actual Weight on Exercise Intensity and Calorie Burn",
    x = "Actual Weight",
    y = "Calories Burnt"
  ) +
  white_panel

ggplot(data = df, mapping = aes(x = Duration, y = Calories.Burn, color = Exercise.Intensity)) +
  geom_point() +
  labs(
    title = "Influence of Duration on Exercise Intensity and Calorie Burn",
    x = "Duration",
    y = "Calories Burnt"
  ) +
  white_panel
Age and Calorie Burn: There seems to be no strong pattern between age and the calories burned across different exercise intensities. This could suggest that age is not a significant factor in determining how many calories one burns during exercise.
Duration and Calorie Burn: The calories burned seem to increase with the duration of the exercise, irrespective of the exercise intensity. This suggests that the longer one exercises, the more calories they are likely to burn.
Actual Weight and Calorie Burn: Similar to age, the actual weight also doesn’t exhibit a strong trend with the calories burned. Exercise intensity colors are scattered across, indicating no clear relationship.
# Create a scatterplot to visualize the relationship between Age, Duration, and Calories.Burn
library(ggplot2)
# Age on x, Duration on y; point colour runs from blue (low) to red (high)
# according to Calories.Burn.
scatterplot <- ggplot(
  data = df,
  mapping = aes(x = Age, y = Duration, color = Calories.Burn)
) +
  geom_point() +
  scale_color_gradient(low = "blue", high = "red") +
  theme_minimal() +
  labs(
    title = "Relationship between Age, Duration, and Calorie Burn",
    x = "Age",
    y = "Duration"
  )
print(scatterplot)
# Stacked bar chart of Calories.Burn per Intensity level: every row adds one
# segment, so each bar's height is the total calories burned at that level.
# geom_col() is the bar geom for pre-computed y values; the previous
# geom_histogram(stat = "identity") drew the same bars but warned that
# `binwidth`, `bins` and `pad` were being ignored.
exercise_calories <- ggplot(df, aes(x = Intensity, y = Calories.Burn)) +
  geom_col(fill = "purple", color = "darkorange") +
  labs(title = "Exercise Intensity vs Calories Burn", x = "Intensity", y = "Calories Burn") +
  theme_minimal()
## Warning in geom_histogram(stat = "identity", fill = "purple", color =
## "darkorange"): Ignoring unknown parameters: `binwidth`, `bins`, and `pad`
# Render the Intensity vs Calories.Burn chart
print(x = exercise_calories)
library(ggplot2)
# Per-session efficiency: calories burned per minute of exercise
df <- transform(df, Efficiency = Calories.Burn / Duration)
# Keep the data frame ordered from the most to the least efficient session
df <- df[order(-df$Efficiency), ]
# Fixed fill colour for each exercise type
my_colors <- c("Aerobics" = "blue", "Strength Training" = "green", "High Intensity Training" = "yellow",
               "Weight Training" = "orange", "Yoga" = "red", "Basic Workout" = "purple", "Cardio" = "pink",
               "Leg Workout" = "brown", "Circuit Training" = "gray", "Core Workout" = "cyan")
# Average the per-session efficiency within each exercise before plotting.
# Drawing the raw rows with stat = "identity" stacks them, so each bar showed
# the SUM of session efficiencies (driven mostly by how many sessions an
# exercise has) rather than a calories-per-minute figure.
efficiency_by_exercise <- aggregate(Efficiency ~ Exercise, data = df, FUN = mean)
# Horizontal bars of mean efficiency; reorder() on -Efficiency sorts the
# exercise levels by descending mean efficiency.
efficiency_plot <- ggplot(
  efficiency_by_exercise,
  aes(x = Efficiency, y = reorder(Exercise, -Efficiency))
) +
  geom_col(aes(fill = Exercise)) +
  scale_fill_manual(values = my_colors) +
  labs(title = "Exercise Efficiency", x = "Efficiency (Calories/Minute)", y = "Types of Exercise") +
  theme_minimal()
# Display the bar plot
print(efficiency_plot)
High-Efficiency Activities: High Intensity Training seems to be the most efficient exercise in terms of calories burned per minute.
Low-Efficiency Activities: Yoga and Basic Workout appear to be the least efficient for calorie burn per minute.
# Load required libraries
library(ggplot2)
library(dplyr)
library(scales)
# Count rows per Exercise x Gender x Intensity cell. Grouping is dropped
# before the percentage is computed, so `frequency` is a share of ALL rows.
df_summary <- df %>%
  group_by(Exercise, Gender, Intensity) %>%
  summarise(count = n(), .groups = "drop") %>%
  mutate(frequency = 100 * count / sum(count))
# Candidate fill colours (more are listed than there are intensity levels)
intensity_palette <- c("red", "green", "blue", "purple", "orange", "pink", "yellow")
# One facet per exercise; within each gender, dodged bars per intensity level
grouped_bar_plot <- ggplot(
  data = df_summary,
  mapping = aes(x = Gender, y = frequency, fill = Intensity)
) +
  geom_col(position = "dodge") +
  facet_wrap(vars(Exercise)) +
  scale_fill_manual(values = intensity_palette) +
  scale_y_continuous(labels = scales::percent_format(scale = 1)) +
  labs(
    title = "Gender and Intensity Distribution Across Different Exercises",
    x = "Gender",
    y = "Percentage of Participants"
  ) +
  theme_light() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    legend.position = "top"
  )
# Render the chart
print(grouped_bar_plot)
Intensity Variation: For both genders, the majority of exercise sessions are of “Medium Intensity,” followed by “High Intensity” and “Low Intensity”. This might indicate a general preference for moderate exercise sessions across the population.
Gender Similarities: Both genders seem to follow a similar distribution of exercise intensity, suggesting that the choice of exercise intensity is not significantly influenced by gender.
library(ggplot2)
# Stacked bars: number of rows per exercise, split by weather condition
ggplot(data = df, mapping = aes(x = Exercise, fill = Weather.Conditions)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Exercises by Weather Conditions", y = "Frequency of Exercises")
# Load required libraries
library(ggplot2)
library(dplyr)
library(scales)
# Count rows per Exercise x Gender x Weather.Conditions cell. Grouping is
# dropped before the percentage is computed, so `frequency` is a share of ALL rows.
df_summary <- df %>%
  group_by(Exercise, Gender, Weather.Conditions) %>%
  summarise(count = n(), .groups = "drop") %>%
  mutate(frequency = 100 * count / sum(count))
# Candidate fill colours for the weather conditions
weather_palette <- c(
  "gold", "orange", "purple", "blue", "darkorange",
  "deeppink", "red", "darkcyan", "darkmagenta", "darkslategray"
)
# One facet per exercise; within each gender, dodged bars per weather condition
grouped_bar_plot <- ggplot(
  data = df_summary,
  mapping = aes(x = Gender, y = frequency, fill = Weather.Conditions)
) +
  geom_col(position = "dodge") +
  facet_wrap(vars(Exercise)) +
  scale_fill_manual(values = weather_palette) +
  scale_y_continuous(labels = scales::percent_format(scale = 1)) +
  labs(
    title = "Distribution of Exercise Types by Gender and Weather Conditions",
    x = "",
    y = "Proportion of Participants (%)"
  ) +
  theme_light() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    legend.position = "top"
  )
# Render the chart
print(grouped_bar_plot)
# Grid of Duration vs Exercise.Intensity scatter plots: one row per exercise,
# one column per weather condition. Each panel gets a linear fit with its
# confidence band; y-axis tick labels are hidden.
scatter_grid <- ggplot(data = df, mapping = aes(x = Duration, y = Exercise.Intensity)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE) +
  facet_grid(rows = vars(Exercise), cols = vars(Weather.Conditions)) +
  labs(
    title = "Relationship between weather and exercise",
    x = "Duration",
    y = "Exercise Intensity"
  ) +
  theme_minimal() +
  theme(axis.text.y = element_blank())
# Render the grid
print(scatter_grid)
## `geom_smooth()` using formula = 'y ~ x'
Intensity and Duration Correlation: There appears to be a weak positive correlation between exercise intensity and duration, regardless of the weather conditions. This could imply that longer workouts are generally more intense.
Weather-Independent: The scatter of points across different weather conditions does not indicate a strong relationship between weather and either exercise duration or intensity.
library(ggplot2)
# Side-by-side bars: number of rows per exercise, split by gender
bar_plot <- ggplot(data = df, mapping = aes(x = Exercise, fill = Gender)) +
  geom_bar(position = "dodge") +
  labs(title = "Exercise Variety by Gender", y = "Frequency of Participants") +
  theme(axis.text.x = element_text(angle = 75, hjust = 1))
# Render the chart
print(bar_plot)
Males predominantly engage in Strength Training and Basic Workout, whereas females show a preference for Yoga and Aerobics.
There’s gender-based diversification in exercise types; for instance, more females engage in Yoga, while more males are into Strength Training.
library(ggplot2)
library(viridis) # Load the viridis package
## Loading required package: viridisLite
##
## Attaching package: 'viridis'
## The following object is masked from 'package:scales':
##
## viridis_pal
# Box plots of BMI for each exercise type, split by gender, on a white
# background with a discrete viridis fill palette.
box_plot <- ggplot(data = df, mapping = aes(x = Exercise, y = BMI, fill = Gender)) +
  geom_boxplot() +
  scale_fill_viridis(discrete = TRUE) +
  labs(title = "Boxplot of BMI by Exercise Type and Gender") +
  theme(
    axis.text.x = element_text(angle = 75, hjust = 1),
    panel.background = element_rect(fill = "white"),
    plot.background = element_rect(fill = "white")
  )
# Render the chart
print(box_plot)
The median BMI for most exercise types lies in the “Healthy Weight” range, but it varies by gender.
Yoga and Aerobics attract participants with a lower range of BMI, indicating these exercises might be popular among individuals focusing on weight management.
# Load ggplot2 library
library(ggplot2)
# Stacked bars: number of sessions at each Duration value (treated as a
# category), split by exercise type.
ggplot(data = df, mapping = aes(x = factor(Duration), fill = Exercise)) +
  geom_bar() +
  labs(title = "Exercise Preferences by Duration", y = "Count of Exercise Sessions") +
  theme(axis.text.x = element_text(angle = 75, hjust = 1))
Short-duration exercises like High Intensity Training and Core Workout are popular, indicating people’s preference for quick, intense workouts.
Long-duration exercises are comparatively less popular, potentially due to time constraints or the intensity level required.
# Absolute gap between actual and dream weight
df$Weight.Diff <- abs(df$Actual.Weight - df$Dream.Weight)
# Bin the gap into [0, 5], (5, 10], (10, 15] and (15, Inf).
# include.lowest = TRUE keeps a gap of exactly 0 (actual weight equal to
# dream weight) in the first bin; without it cut() returns NA for that value.
df$Weight.Diff.Bin <- cut(
  df$Weight.Diff,
  breaks = c(0, 5, 10, 15, Inf),
  labels = c("0-5", "6-10", "11-15", "15+"),
  include.lowest = TRUE
)
# Scatter of dream weight against actual weight, coloured by the size of the gap
ggplot(df, aes(x = Actual.Weight, y = Dream.Weight, color = Weight.Diff.Bin)) +
  geom_point() +
  scale_color_manual(values = c("0-5" = "blue", "6-10" = "green", "11-15" = "yellow", "15+" = "red")) +
  ggtitle("Actual Weight vs Dream Weight")
1. Data Filtering First, let's remove any missing values for ‘Weather Conditions’ and ‘Calories Burn’. However, as we checked earlier, there are no missing values in our data, so we can proceed to the next step.
2. Data Grouping Next, we group the observations by ‘BMI Classification’ so that the mean ‘Calories Burn’ of each group can be compared.
Solution:
Defining the Hypothesis: H0: The mean ‘Calories Burn’ is the same for all ‘BMI Classification’ groups. H1: At least one ‘BMI Classification’ group has a different mean ‘Calories Burn’.
# Fit a one-way ANOVA of Calories.Burn across the BMI.Classification groups
# and print the ANOVA table.
anova_result <- aov(formula = Calories.Burn ~ BMI.Classification, data = df)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## BMI.Classification 2 18147 9074 0.676 0.509
## Residuals 3861 51808886 13419
# Keep the ANOVA table in a variable and print it to read off the p-value
anova_summary <- summary(object = anova_result)
print(anova_summary)
## Df Sum Sq Mean Sq F value Pr(>F)
## BMI.Classification 2 18147 9074 0.676 0.509
## Residuals 3861 51808886 13419
# Post-hoc pairwise comparisons between BMI groups (Tukey's HSD)
posthoc <- TukeyHSD(x = anova_result)
print(posthoc)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = Calories.Burn ~ BMI.Classification, data = df)
##
## $BMI.Classification
## diff lwr upr p adj
## Obesity-Healthy weight 3.8698285 -6.680923 14.42058 0.6656105
## Overweight-Healthy weight 4.8684980 -5.699947 15.43694 0.5265074
## Overweight-Obesity 0.9986695 -10.204163 12.20150 0.9762055
ANOVA: The p-value from the ANOVA test is 0.509, which is greater than 0.05. This means that we fail to reject the null hypothesis (H0), suggesting there is no significant difference in ‘Calories Burn’ across the different ‘BMI Classifications’ in our dataset.
Post-Hoc Test: The Tukey HSD test also supports this result. All pairwise comparisons have adjusted p-values greater than 0.05, confirming that the mean ‘Calories Burn’ is not significantly different between any two ‘BMI Classification’ groups.
# Three views of Calories.Burn by BMI.Classification.

# 1. Box plot, filled per BMI class with the "Set1" brewer palette
ggplot(
  data = df,
  mapping = aes(x = BMI.Classification, y = Calories.Burn, fill = BMI.Classification)
) +
  geom_boxplot() +
  scale_fill_brewer(palette = "Set1") +
  theme_minimal() +
  labs(
    title = "Boxplot of 'Calories Burn' by 'BMI Classification'",
    x = "BMI Classification",
    y = "Calories Burn"
  )

# 2. Violin plot, filled per BMI class with the "Set3" brewer palette
ggplot(
  data = df,
  mapping = aes(x = BMI.Classification, y = Calories.Burn, fill = BMI.Classification)
) +
  geom_violin() +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal() +
  labs(
    title = "Violinplot of 'Calories Burn' by 'BMI Classification'",
    x = "BMI Classification",
    y = "Calories Burn"
  )

# 3. Semi-transparent box plot with every observation overlaid as a
#    horizontally jittered point
ggplot(data = df, mapping = aes(x = BMI.Classification, y = Calories.Burn)) +
  geom_boxplot(mapping = aes(fill = BMI.Classification), alpha = 0.5) +
  geom_jitter(mapping = aes(color = BMI.Classification), width = 0.2) +
  labs(
    title = "Boxplot and Swarmplot of 'Calories Burn' by 'BMI Classification'",
    x = "BMI Classification",
    y = "Calories Burn"
  )
Boxplot: The boxplot shows the median, quartiles, and outliers for ‘Calories Burn’ for each ‘BMI Classification’. The overlap of the interquartile ranges suggests that there’s no significant difference in ‘Calories Burn’ across the groups.
Violinplot: The violinplot combines a boxplot with a kernel density estimation. It provides more information about the density of the data points at different calorie levels. Again, the shapes of the “violins” are quite similar for all categories, indicating no significant difference in ‘Calories Burn’.
Boxplot and Swarmplot: The swarm-style overlay on the boxplot displays the individual data points, jittered horizontally to reduce overplotting. This provides a sense of the data distribution while still summarizing it with a boxplot. Once more, the distribution of points appears quite similar across the ‘BMI Classification’ groups.
# Load the required library
library(dplyr)
# Shapiro-Wilk normality test of Heart.Rate within every
# Weather.Conditions x Exercise.Intensity cell; only the p-value is kept.
df %>%
  group_by(Weather.Conditions, Exercise.Intensity) %>%
  summarise(p_value = shapiro.test(Heart.Rate)[["p.value"]])
## `summarise()` has grouped output by 'Weather.Conditions'. You can override
## using the `.groups` argument.
## # A tibble: 30 × 3
## # Groups: Weather.Conditions [3]
## Weather.Conditions Exercise.Intensity p_value
## <fct> <int> <dbl>
## 1 Cloudy 1 0.00000972
## 2 Cloudy 2 0.0000382
## 3 Cloudy 3 0.0000691
## 4 Cloudy 4 0.0000137
## 5 Cloudy 5 0.0000462
## 6 Cloudy 6 0.0000475
## 7 Cloudy 7 0.000246
## 8 Cloudy 8 0.000326
## 9 Cloudy 9 0.00000555
## 10 Cloudy 10 0.00000358
## # ℹ 20 more rows
# Load the required library
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Treat the intensity score as a categorical factor from here on
df[["Exercise.Intensity"]] <- as.factor(df[["Exercise.Intensity"]])
# Levene's test for equal variance of Heart.Rate across all
# Weather.Conditions x Exercise.Intensity cells
leveneTest(Heart.Rate ~ Weather.Conditions * Exercise.Intensity, data = df)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 29 0.6468 0.9269
## 3834
Observations:
Building the Hypotheses When running a Two-Way ANOVA, we’ll be testing three sets of null hypotheses:
1. Null Hypothesis for Weather Conditions Effect: There is no effect of weather conditions on heart rate.
2. Null Hypothesis for Exercise Intensity Effect: There is no effect of exercise intensity on heart rate.
3. Null Hypothesis for Interaction Effect: There is no interaction effect between weather conditions and exercise intensity on heart rate.
library(car)
# Ensure that both predictors are factors so aov() treats them as categorical
df$Weather.Conditions <- as.factor(df$Weather.Conditions)
df$Exercise.Intensity <- as.factor(df$Exercise.Intensity)
# Two-way ANOVA of heart rate on weather conditions, exercise intensity and
# their interaction. This is the model the stated hypotheses and the
# Shapiro-Wilk / Levene checks refer to; the formula previously fitted
# Calories.Burn ~ Exercise * BMI.Classification, which tests none of them.
# NOTE(review): output rendered from the earlier formula is stale and must be
# regenerated, and the written interpretation re-checked against the new table.
anova_model <- aov(Heart.Rate ~ Weather.Conditions * Exercise.Intensity, data = df)
# F-value and p-value for each main effect and for the interaction term
summary(anova_model)
## Df Sum Sq Mean Sq F value Pr(>F)
## Exercise 9 177844 19760 1.478 0.1498
## BMI.Classification 2 17030 8515 0.637 0.5290
## Exercise:BMI.Classification 18 375792 20877 1.562 0.0611 .
## Residuals 3834 51256368 13369
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Tukey's HSD pairwise comparisons for the two-way model. Only worth reading
# when a main effect or the interaction is significant; the result is stored
# here and not printed.
tukey_results <- TukeyHSD(x = anova_model)
# head(tukey_results, n = 10)
Two-Way ANOVA Results
Tukey Multiple Comparisons
The Tukey test provides pairwise comparisons, with the differences, lower and upper bounds of the 95% family-wise confidence interval, and adjusted p-values.
Interpretation of the Output
Null Hypothesis for Weather Conditions Effect:
Since the p-value is greater than 0.05, we fail to reject the null hypothesis. This suggests that weather conditions do not have a significant effect on heart rate.
Null Hypothesis for Exercise Intensity Effect:
Again, the p-value is greater than 0.05, indicating that exercise intensity levels do not have a significant effect on heart rate.
Null Hypothesis for Interaction Effect:
The p-value is greater than 0.05, so we fail to reject the null hypothesis. This suggests that there is no interaction effect between weather conditions and exercise intensity on heart rate.
Tukey Test
The Tukey test further confirms these findings. Most of the adjusted p-values are greater than 0.05, indicating that the pairwise differences between the levels of each factor are not statistically significant.
Summary
No significant effect of weather conditions on heart rate. No significant effect of exercise intensity on heart rate. No significant interaction effect between weather conditions and exercise intensity on heart rate. These findings suggest that neither weather conditions nor exercise intensity have a significant impact on heart rate in this study, and there is no interaction effect between the two. This would mean that other variables might be influencing heart rate, which are not captured in the current model.
# Store Age and Efficiency as doubles, then print the structure of the frame
# to confirm the column types
df[c("Age", "Efficiency")] <- lapply(df[c("Age", "Efficiency")], as.numeric)
str(df)
## 'data.frame': 3864 obs. of 18 variables:
## $ ID : int 484 1017 1367 595 1276 1471 1096 2505 296 2049 ...
## $ Exercise : Factor w/ 10 levels "Aerobics","Basic Workout",..: 4 8 2 4 7 8 7 3 5 4 ...
## $ Calories.Burn : num 497 497 497 487 483 ...
## $ Dream.Weight : num 97.5 85.4 72.1 78 77.6 ...
## $ Actual.Weight : num 102.4 90.3 70 77.4 76.6 ...
## $ Age : num 47 27 59 47 45 36 34 37 19 46 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 2 2 1 1 1 1 1 1 1 ...
## $ Duration : int 20 20 20 20 20 21 20 21 20 20 ...
## $ Heart.Rate : int 129 106 114 153 159 147 167 147 112 156 ...
## $ BMI : num 28.7 33.7 18.7 28.7 24.9 ...
## $ Weather.Conditions: Factor w/ 3 levels "Cloudy","Rainy",..: 2 1 1 3 2 2 1 3 2 2 ...
## $ Exercise.Intensity: Factor w/ 10 levels "1","2","3","4",..: 4 6 6 8 10 2 3 7 10 3 ...
## $ BMI.Classification: chr "Overweight" "Obesity" "Healthy weight" "Overweight" ...
## $ Intensity : chr "Medium Intensity" "High Intensity" "High Intensity" "High Intensity" ...
## $ HeartRateLevel : Factor w/ 5 levels "Low","Moderate",..: 3 2 2 4 4 4 5 4 2 4 ...
## $ Efficiency : num 24.9 24.9 24.8 24.4 24.1 ...
## $ Weight.Diff : num 4.864 4.925 2.1 0.594 0.993 ...
## $ Weight.Diff.Bin : Factor w/ 4 levels "0-5","6-10","11-15",..: 1 1 1 1 1 1 1 1 1 1 ...
# Categorical descriptors become factors in a single pass
df[c("Gender", "BMI.Classification", "Weather.Conditions", "Intensity")] <-
  lapply(
    df[c("Gender", "BMI.Classification", "Weather.Conditions", "Intensity")],
    as.factor
  )
# Duration is read as integer; store it as double
df[["Duration"]] <- as.numeric(df[["Duration"]])
# Identify the numeric columns.
# vapply() always returns a logical vector (sapply() can silently return a
# list on degenerate input), so the mask is safe to use for column subsetting.
numeric_columns <- vapply(df, is.numeric, logical(1))
# Pairwise correlation matrix of the numeric variables, using only rows
# that have no missing values
correlation <- cor(df[, numeric_columns], use = "complete.obs")
# Load required library
library(corrplot)
# Upper-triangle colour heatmap with the coefficient printed in each cell;
# the diagonal is hidden and margins are widened so labels are not clipped
corrplot(
  correlation,
  method = "color",
  type = "upper",
  tl.col = "black",
  tl.srt = 45,
  diag = FALSE,
  addCoef.col = "black",
  tl.cex = 0.7,
  mar = c(1, 1, 2, 1)
)
# Ordinary least-squares fit of calories burned on efficiency and duration.
# NOTE(review): Efficiency looks like it is derived from the response
# (roughly Calories.Burn / Duration in the str() output) -- confirm; if so,
# the very high R-squared is partly by construction.
linear_model <- lm(
  formula = Calories.Burn ~ Efficiency + Duration,
  data = df
)
# Coefficient table, residual quantiles, R-squared and overall F-test
summary(linear_model)
##
## Call:
## lm(formula = Calories.Burn ~ Efficiency + Duration, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -179.00 -22.14 11.66 28.25 76.65
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -233.63177 4.36725 -53.50 <2e-16 ***
## Efficiency 30.99887 0.20834 148.79 <2e-16 ***
## Duration 6.95342 0.07601 91.48 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.64 on 3861 degrees of freedom
## Multiple R-squared: 0.8516, Adjusted R-squared: 0.8515
## F-statistic: 1.108e+04 on 2 and 3861 DF, p-value: < 2.2e-16
The linear regression model aims to predict “Calories Burn” based on two predictor variables: “Efficiency” and “Duration.” Here are some observations based on the model’s summary:
Model Fit: The model provides a good fit for the data, with an adjusted R-squared value of 0.8515. This indicates that approximately 85.15% of the variance in “Calories Burn” is explained by the predictors, “Efficiency” and “Duration.”
Significant Predictors: Both predictor variables, “Efficiency” and “Duration,” are statistically significant in predicting “Calories Burn.” The p-values associated with these predictors are very close to zero, suggesting a strong relationship between the predictors and the target variable.
Intercept: The intercept term (-233.63177) represents the estimated “Calories Burn” when both “Efficiency” and “Duration” are zero. While this value may not have a practical interpretation in this context, it’s essential for the model’s calculations.
Coefficients: The coefficients for the predictor variables provide insights into the relationship between each predictor and the target variable.
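Residuals: The residuals indicate the model’s prediction errors. They range from -179.00 to 76.65 with a median of 11.66, so they are skewed to the left rather than symmetrically distributed around zero. The model’s assumptions about the errors should therefore be verified with diagnostic plots (for example, a residuals-versus-fitted plot and a normal Q-Q plot) before relying on the inference.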
F-Statistic: The F-statistic tests whether the overall model is a good fit for the data. With a very high F-statistic and an associated p-value close to zero, it suggests that the model as a whole is a good fit.
Residual Standard Error: The residual standard error of 44.64 provides a measure of the typical prediction error of the model. Smaller values indicate better model fit.
In summary, this linear regression model is highly significant and explains a substantial portion of the variance in “Calories Burn.” Both “Efficiency” and “Duration” are strong predictors of calorie burn, with efficiency having the most substantial effect.
Summary:
The linear regression model is a robust predictor for “Calories Burn,” accounting for approximately 85.15% of the variance in the target variable. The model’s high F-statistic and near-zero p-value corroborate the overall fit and significance of the model. The residual standard error of 44.64 suggests that the model’s predictions typically deviate from the actual observations by about this many calories, indicating a reasonably good fit. The coefficients suggest that, holding the other predictor constant, a one-unit increase in “Efficiency” is associated with approximately 31 additional calories burned, while a one-minute increase in “Duration” is associated with around 7 extra calories burned.
The project successfully conducted a comprehensive analysis of exercise habits through data visualization and statistical testing. The key findings suggest that neither BMI nor weather conditions significantly affect the number of calories burned during exercise, debunking some commonly held beliefs. However, the limitations of the dataset indicate that there might be other unmeasured variables that could provide further insights.
Statistical Findings:
One-Way ANOVA: No significant difference was observed in calories burned across different BMI classifications. Two-Way ANOVA: Neither weather conditions nor exercise intensity had a significant effect on heart rate, and there was no significant interaction between the two.
Based on these results, we can conclude that our initial hypothesis, which was to find out if there is a significant difference in ‘Calories Burn’ across ‘BMI Classification’ groups, is not supported by the data. Both the ANOVA and the post-hoc Tukey HSD test indicate that there are no significant differences in the ‘Calories Burn’ among the different ‘BMI Classification’ categories. All these visualizations reinforce the ANOVA findings that there is no significant difference in the ‘Calories Burn’ across different ‘BMI Classification’ categories. Furthermore, in the linear regression model, the two predictor variables, “Efficiency” and “Duration,” are both statistically significant, with p-values close to zero, underscoring their importance in predicting calorie burn.
Limitations: Data Completeness: No missing values were identified, but the dataset might not capture all factors affecting exercise habits. Sample Bias: The sample may not be fully representative of the general population.
Conclusion: The project successfully conducted a comprehensive analysis of exercise habits through data visualization and statistical testing. The key findings suggest that neither BMI nor weather conditions significantly affect the number of calories burned during exercise, debunking some commonly held beliefs. However, the limitations of the dataset indicate that there might be other unmeasured variables that could provide further insights.
Future Work: -